/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    ReservoirSample.java
 *    Copyright (C) 2007-2012 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.filters.unsupervised.instance;

import java.io.IOException;
import java.io.StringReader;
import java.util.Enumeration;
import java.util.Random;
import java.util.Vector;

import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Randomizable;
import weka.core.Utils;
import weka.core.WeightedAttributesHandler;
import weka.filters.Filter;
import weka.filters.StreamableFilter;
import weka.filters.UnsupervisedFilter;

/**
 * <!-- globalinfo-start --> Produces a random subsample of a dataset using the
 * reservoir sampling Algorithm "R" by Vitter. The original data set does not
 * have to fit into main memory, but the reservoir does.
 * <p/>
 * <!-- globalinfo-end -->
 * 
 * <!-- technical-bibtex-start --> BibTeX:
 * 
 * <pre>
 * &#64;article{Vitter1985,
 *    author = {J. S. Vitter},
 *    journal = {ACM Transactions on Mathematical Software},
 *    number = {1}
 *    volume = {11}
 *    pages = {37-57},
 *    title = {Random Sampling with a Reservoir},
 *    year = {1985}
 * }
 * </pre>
 * 
 * </p>
 * <!-- options-start --> Valid options are:
 * <p/>
 * 
 * <pre>
 * -S &lt;num&gt;
 *  Specify the random number seed (default 1)
 * </pre>
 * 
 * <pre>
 * -Z &lt;num&gt;
 *  The size of the output dataset - number of instances
 *  (default 100)
 * </pre>
 * 
 * <!-- options-end -->
 * 
 * @author Mark Hall (mhall{[at]}pentaho{[dot]}org)
 * @version $Revision$
 */
public class ReservoirSample extends Filter implements UnsupervisedFilter, OptionHandler, StreamableFilter, Randomizable, WeightedAttributesHandler {

    /** for serialization */
    static final long serialVersionUID = 3119607037607101160L;

    /** The subsample size, number of instances% */
    protected int m_SampleSize = 100;

    /** Holds the sub-sample (reservoir) */
    protected Object[] m_subSample;

    /** The current instance being processed */
    protected int m_currentInst;

    /** The random number generator seed */
    protected int m_RandomSeed = 1;

    /** The random number generator */
    protected Random m_random;

    /** True if the incoming data contains string attributes */
    protected boolean m_containsStringAtts;

    /**
     * Returns a string describing this filter
     * 
     * @return a description of the classifier suitable for displaying in the
     *         explorer/experimenter gui
     */
    public String globalInfo() {
        return "Produces a random subsample of a dataset using the reservoir sampling " + "Algorithm \"R\" by Vitter. The original data set does not have to fit " + "into main memory, but the reservoir does. ";
    }

    /**
     * Returns an enumeration describing the available options.
     * 
     * @return an enumeration of all the available options.
     */
    @Override
    public Enumeration<Option> listOptions() {

        Vector<Option> result = new Vector<Option>();

        result.addElement(new Option("\tSpecify the random number seed (default 1)", "S", 1, "-S <num>"));

        result.addElement(new Option("\tThe size of the output dataset - number of instances\n" + "\t(default 100)", "Z", 1, "-Z <num>"));

        return result.elements();
    }

    /**
     * Parses a given list of options.
     * <p/>
     * 
     * <!-- options-start --> Valid options are:
     * <p/>
     * 
     * <pre>
     * -S &lt;num&gt;
     *  Specify the random number seed (default 1)
     * </pre>
     * 
     * <pre>
     * -Z &lt;num&gt;
     *  The size of the output dataset - number of instances
     *  (default 100)
     * </pre>
     * 
     * <!-- options-end -->
     * 
     * @param options the list of options as an array of strings
     * @throws Exception if an option is not supported
     */
    @Override
    public void setOptions(String[] options) throws Exception {

        String tmpStr = Utils.getOption('S', options);
        if (tmpStr.length() != 0) {
            setRandomSeed(Integer.parseInt(tmpStr));
        } else {
            setRandomSeed(1);
        }

        tmpStr = Utils.getOption('Z', options);
        if (tmpStr.length() != 0) {
            setSampleSize(Integer.parseInt(tmpStr));
        } else {
            setSampleSize(100);
        }

        Utils.checkForRemainingOptions(options);
    }

    /**
     * Gets the current settings of the filter.
     * 
     * @return an array of strings suitable for passing to setOptions
     */
    @Override
    public String[] getOptions() {

        Vector<String> result = new Vector<String>();

        result.add("-S");
        result.add("" + getRandomSeed());

        result.add("-Z");
        result.add("" + getSampleSize());

        return result.toArray(new String[result.size()]);
    }

    /**
     * Returns the tip text for this property
     * 
     * @return tip text for this property suitable for displaying in the
     *         explorer/experimenter gui
     */
    public String randomSeedTipText() {
        return "The seed used for random sampling.";
    }

    /**
     * Gets the random number seed.
     * 
     * @return the random number seed.
     */
    public int getRandomSeed() {
        return m_RandomSeed;
    }

    /**
     * Sets the random number seed.
     * 
     * @param newSeed the new random number seed.
     */
    public void setRandomSeed(int newSeed) {
        m_RandomSeed = newSeed;
    }

    public void setSeed(int seed) {
        setRandomSeed(seed);
    }

    public int getSeed() {
        return getRandomSeed();
    }

    /**
     * Returns the tip text for this property
     * 
     * @return tip text for this property suitable for displaying in the
     *         explorer/experimenter gui
     */
    public String sampleSizeTipText() {
        return "Size of the subsample (reservoir). i.e. the number of instances.";
    }

    /**
     * Gets the subsample size.
     * 
     * @return the subsample size
     */
    public int getSampleSize() {
        return m_SampleSize;
    }

    /**
     * Sets the size of the subsample.
     * 
     * @param newSampleSize size of the subsample.
     */
    public void setSampleSize(int newSampleSize) {
        m_SampleSize = newSampleSize;
    }

    /**
     * Returns the Capabilities of this filter.
     * 
     * @return the capabilities of this object
     * @see Capabilities
     */
    @Override
    public Capabilities getCapabilities() {
        Capabilities result = super.getCapabilities();
        result.disableAll();

        // attributes
        result.enableAllAttributes();
        result.enable(Capability.MISSING_VALUES);

        // class
        result.enableAllClasses();
        result.enable(Capability.MISSING_CLASS_VALUES);
        result.enable(Capability.NO_CLASS);

        return result;
    }

    /**
     * Sets the format of the input instances.
     * 
     * @param instanceInfo an Instances object containing the input instance
     *                     structure (any instances contained in the object are
     *                     ignored - only the structure is required).
     * @return true if the outputFormat may be collected immediately
     * @throws Exception if the input format can't be set successfully
     */
    @Override
    public boolean setInputFormat(Instances instanceInfo) throws Exception {

        super.setInputFormat(instanceInfo);
        m_containsStringAtts = instanceInfo.checkForStringAttributes();

        setOutputFormat(instanceInfo);

        m_subSample = new Object[m_SampleSize];
        m_currentInst = 0;
        m_random = new Random(m_RandomSeed);

        return true;
    }

    /**
     * Decides whether the current instance gets retained in the reservoir.
     * 
     * @param instance the Instance to potentially retain
     */
    protected void processInstance(Instance instance) {
        if (m_currentInst < m_SampleSize) {
            m_subSample[m_currentInst] = m_containsStringAtts ? instance.toString() : instance.copy();
        } else {
            double r = m_random.nextDouble();
            if (r < ((double) m_SampleSize / (double) m_currentInst)) {
                r = m_random.nextDouble();
                int replace = (int) (m_SampleSize * r);
                m_subSample[replace] = m_containsStringAtts ? instance.toString() : instance.copy();
            }
        }
        m_currentInst++;
    }

    /**
     * Input an instance for filtering. Filter requires all training instances be
     * read before producing output.
     * 
     * @param instance the input instance
     * @return true if the filtered instance may now be collected with output().
     * @throws IllegalStateException if no input structure has been defined
     */
    @Override
    public boolean input(Instance instance) {

        if (getInputFormat() == null) {
            throw new IllegalStateException("No input instance format defined");
        }
        if (m_NewBatch) {
            resetQueue();
            m_NewBatch = false;
        }
        if (isFirstBatchDone()) {
            push(instance);
            return true;
        } else {
            // bufferInput(instance);
            if (!m_containsStringAtts) {
                copyValues(instance, false);
            }
            processInstance(instance);
            return false;
        }
    }

    /**
     * Signify that this batch of input to the filter is finished. If the filter
     * requires all instances prior to filtering, output() may now be called to
     * retrieve the filtered instances.
     * 
     * @return true if there are instances pending output
     * @throws IllegalStateException if no input structure has been defined
     */
    @Override
    public boolean batchFinished() {

        if (getInputFormat() == null) {
            throw new IllegalStateException("No input instance format defined");
        }

        if (!isFirstBatchDone()) {
            // Do the subsample, and clear the input instances.
            createSubsample();
        }
        flushInput();

        m_NewBatch = true;
        m_FirstBatchDone = true;
        return (numPendingOutput() != 0);
    }

    /**
     * Creates a subsample of the current set of input instances. The output
     * instances are pushed onto the output queue for collection.
     */
    protected void createSubsample() {

        StringBuilder sb = null;
        if (m_containsStringAtts) {
            sb = new StringBuilder();
            sb.append(getInputFormat().stringFreeStructure()).append("\n");
        }
        for (int i = 0; i < m_SampleSize; i++) {
            if (m_subSample[i] != null) {
                if (!m_containsStringAtts) {
                    Instance copy = (Instance) ((Instance) m_subSample[i]).copy();
                    push(copy, false); // No need to copy instance
                } else {
                    sb.append(m_subSample[i].toString()).append("\n");
                }
            } else {
                // less data in the original than was asked for
                // as a sample.
                break;
            }
        }
        m_subSample = null;

        if (m_containsStringAtts) {
            try {
                Instances stringSample = new Instances(new StringReader(sb.toString()));
                for (int i = 0; i < stringSample.numInstances(); i++) {
                    push(stringSample.instance(i), false);
                }
            } catch (IOException ex) {
                throw new RuntimeException(ex);
            }
        }
    }

    /**
     * Main method for testing this class.
     * 
     * @param argv should contain arguments to the filter: use -h for help
     */
    public static void main(String[] argv) {
        runFilter(new ReservoirSample(), argv);
    }
}
